This file simulates the data I anticipate for the coordinated analysis that will be my dissertation.
metadata <- tibble(
study = c("new_moms", "deception_detection", "karyn_diss", "murat_rep",
"mideast_men", "stem", "barter", "double_empathy"),
targets = c(20, 95, 212, 200,
9, 59, 310, 8),
perceivers = c(60, 95, 212, 200,
326, 121, 310, 100),
videos = c(20, 95, 318, 300,
9, 121, 155, 8),
paradigm = c("ss", "di", "di", "di",
"ss", "di", "di", "ss") %>%
factor(levels = c("ss", "di"), labels = c("Standard Stimulus",
"Dyadic Interaction")),
inference_schedule = c("Variable", "Variable", "Set", "Set",
"Variable", "Set", "Variable", "Set") %>%
as.factor()
)
vrm <- c("Disclosure", "Edification", "Advisement", "Confirmation", "Question", "Acknowledgment", "Interpretation", "Reflection")
generate_random_number <- function(mean = 8, sd = 3, min = 3, max = 19, digits = 0) {
random_number <- NA
while (is.na(random_number) || random_number < min || random_number > max) {
random_number <- round(rnorm(1, mean = mean, sd = sd), digits)
}
return(random_number)
}
multiply_out <- function(df, n_column, column_name) {
df_expanded <- df %>%
rowwise() %>%
mutate(!!column_name := list(seq_len(!!sym(n_column)))) %>%
unnest(cols = !!sym(column_name))
return(df_expanded)
}
SimulateStudy <- function(study_name, paradigm, seed = 123, n_perceivers = 1, n_videos_per_perceiver = 1){
set.seed(seed)
# Filter for current study
study_data <- metadata %>%
filter(study == study_name)
# Simulate number of chapters within each video
df = tibble(
name = paste0(study_name, "_", 1:study_data$videos),
n_video = 1:study_data$videos,
n_chapter = NA
)
for(i in seq_len(study_data$videos)){
df$n_chapter[i] <- generate_random_number()
}
df <- multiply_out(df, n_column = "n_chapter", column_name = "chapter")
# Simulate number of turns within each chapter
for(i in seq_len(study_data$videos)){
df$n_turns[i] <- generate_random_number(mean = 11, sd = 6,
min = 4, max = 40)
}
df <- multiply_out(df, n_column = "n_turns", column_name = "turn")
# STIMULUS LEVEL VARIABLES
df <- df %>%
group_by(name, chapter) %>%
mutate(
chapter_length = generate_random_number(mean = 45, sd = 6,
min = 18, max = 120,
digits = 3),
turn_length = {raw_turn_lengths <- runif(n(), min = 4, max = 40)
scaled_turn_lengths <- raw_turn_lengths / sum(raw_turn_lengths) *
chapter_length
round(scaled_turn_lengths, 3)
},
start_time = cumsum(lag(turn_length, default = 0)),
end_time = cumsum(turn_length),
turns_from_inference = n() - row_number() + 1,
turn_percent_through_chapter = (row_number() / n()) * 100,
time_percent_through_chapter = end_time/chapter_length * 100,
speaker = ifelse(rep(sample(c(TRUE, FALSE), 1), n()),
rep(c("Partner", "Target"), length.out = n()),
rep(c("Target", "Partner"), length.out = n())) %>%
factor(),
sem_sim = {
repeat {
base_random <- runif(n(), min = -1.00, max = 1.00)
weight <- ifelse(speaker == "Partner",
((turn_percent_through_chapter - 1) / 180)^2,
((turn_percent_through_chapter - 1) / 120)^2)
noise <- ifelse(speaker == "Partner",
rnorm(n(), mean = 0, sd = 0.3),
rnorm(n(), mean = 0, sd = 0.1))
sem_sim_raw <- base_random * (1 - weight) + 1 * weight + noise
if (sum(sem_sim_raw <= -0.99 | sem_sim_raw >= 0.99) / n() < 0.05) {
break
}
}
pmin(pmax(sem_sim_raw, -1.00), 1.00)
},
vrm = sample(vrm, n(), replace = TRUE)
)
# PARTICIPANT-LEVEL VARIABLES
if(paradigm == "DI"){
df <- df %>%
mutate(
target = paste0(name, "_target_", n_video),
perceiver = paste0(name, "_perceiver_", n_video),
partner = paste0(name, "_partner_", n_video),
paradigm = "Dyadic Interaction"
)
} else if (paradigm == "SS"){
# have to double-up on the naming because nesting removes the grouping column
df <- df %>%
mutate(
name2 = name
)
df_list <- df %>%
group_by(name) %>%
nest()
out_list <- list()
for(i in seq_len(n_perceivers)){
df_i <- sample(df_list$data, n_videos_per_perceiver) %>%
bind_rows()
df_i <- df_i %>%
mutate(
target = paste0(name2, "_target_", n_video),
perceiver = paste0(name2, "_perceiver_", i),
partner = paste0(name2, "_partner_", n_video),
paradigm = "Standard Stimulus"
)
out_list[[i]] <- df_i
}
df <- bind_rows(out_list)
df$name <- df$name2
df <- df %>%
dplyr::select(-name2)
}
return(df)
}
df <- list(
stem = SimulateStudy("stem", paradigm = "DI"),
barter = SimulateStudy("barter", paradigm = "DI"),
deception_detection = SimulateStudy("deception_detection", paradigm = "DI"),
new_moms = SimulateStudy("new_moms",
paradigm = "SS",
n_perceivers = 3,
n_videos_per_perceiver = 3),
karyn_diss = SimulateStudy("karyn_diss",
paradigm = "SS",
n_perceivers = 212,
n_videos_per_perceiver = 3),
murat_rep = SimulateStudy("karyn_diss",
paradigm = "SS",
n_perceivers = 200,
n_videos_per_perceiver = 3),
mideast_men = SimulateStudy("mideast_men",
paradigm = "SS",
n_perceivers = 326,
n_videos_per_perceiver = 4),
double_empathy = SimulateStudy("double_empathy",
paradigm = "SS",
n_perceivers = 100,
n_videos_per_perceiver = 4)
) %>%
bind_rows() %>%
ungroup()
## Warning: Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
df <- df %>%
mutate(across(where(is.character), factor))
set.seed(123)
language_mat <- c(
1.00, 0.50, 0.60, 0.40, 0.45, 0.55, -0.20, -0.25, -0.15, 0.30,
0.50, 1.00, 0.45, 0.35, 0.30, 0.40, -0.10, -0.15, -0.05, 0.25,
0.60, 0.45, 1.00, 0.50, 0.40, 0.35, -0.25, -0.20, -0.10, 0.35,
0.40, 0.35, 0.50, 1.00, 0.25, 0.30, 0.10, 0.15, 0.20, 0.10,
0.45, 0.30, 0.40, 0.25, 1.00, 0.50, -0.15, -0.10, -0.05, 0.40,
0.55, 0.40, 0.35, 0.30, 0.50, 1.00, -0.20, -0.15, -0.10, 0.30,
-0.20, -0.10, -0.25, 0.10, -0.15, -0.20, 1.00, 0.60, 0.50, -0.30,
-0.25, -0.15, -0.20, 0.15, -0.10, -0.15, 0.60, 1.00, 0.50, -0.40,
-0.15, -0.05, -0.10, 0.20, -0.05, -0.10, 0.50, 0.50, 1.00, -0.35,
0.30, 0.25, 0.35, 0.10, 0.40, 0.30, -0.30, -0.40, -0.35, 1.00)
language_noise <- rnorm(length(language_mat), mean = -.03, sd = .01)
language_mat <- language_mat + language_noise
# Independent matrix
language_vecs <- rnorm_multi(n = nrow(df),
mu = c(cog_processing_language = 4,
memory_language = 2,
certain_language = 2,
self_ref_language = 10,
curious_language = 1,
perception_language = 2,
emo_anxious = 1,
emo_sad = 2,
emo_anger = 0.5,
emo_positive = 1.5),
sd = c(3, 4, 4, 2, 10,
10, 15, 15, 15, 4),
r = language_mat
)
df$cog_processing_language <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$cog_processing_language),
mu = 4,
sd = 3,
r = c(0.2, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$memory_language <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$memory_language),
mu = 2,
sd = 4,
r = c(0.10, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$certain_language <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$certain_language),
mu = 2,
sd = 4,
r = c(0.12, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$self_ref_language <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$self_ref_language),
mu = 10,
sd = 2,
r = c(0.51, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$curious_language <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$curious_language),
mu = 1,
sd = 10,
r = c(0.29, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$perception_language <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$perception_language),
mu = 2,
sd = 10,
r = c(0.40, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_anxious <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$emo_anxious),
mu = 1,
sd = 15,
r = c(0.14, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_sad <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$emo_sad),
mu = 2,
sd = 15,
r = c(0.13, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_anger <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$emo_anger),
mu = 0.5,
sd = 15,
r = c(0.19, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
df$emo_positive <- rnorm_pre(
data.frame(df$sem_sim, language_vecs$emo_positive),
mu = 1.5,
sd = 4,
r = c(0.15, 0.8) # Replace with the desired correlation
) %>% pmax(., 0)
avg_data <- df %>%
group_by(turn_percent_through_chapter) %>%
summarize(sem_sim = mean(sem_sim), .groups = "drop")
ggplot(df, aes(x = (turn_percent_through_chapter), y = sem_sim)) +
geom_line(aes(group = perceiver), color = "gray",
alpha = 0.01, size = 0.5) +
geom_hline(aes(yintercept = 0), color = "black") +
geom_smooth(data = avg_data, aes(x = turn_percent_through_chapter,
y = sem_sim),
method = "loess", se = FALSE, color = "black") +
labs(
title = "Turn Distance from Inference by Semantic Similarity",
x = "Proximity to Inference",
y = "Semantic Similarity",
color = "Perceiver"
) +
papaja::theme_apa(
base_family = "Times New Roman"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# Create explicit grouping factors
df$target_partner <- interaction(df$target, df$partner, drop = TRUE)
df$chapter_target_partner <- interaction(df$chapter, df$target_partner, drop = TRUE)
# Fit the multilevel model
model <- lmer(
sem_sim ~ turns_from_inference + paradigm +
(1 | perceiver / target_partner / chapter_target_partner),
data = df
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## unable to evaluate scaled gradient
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
# Summarize the model
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## sem_sim ~ turns_from_inference + paradigm + (1 | perceiver/target_partner/chapter_target_partner)
## Data: df
##
## REML criterion at convergence: 433071.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.72504 -0.76812 0.06118 0.79077 2.67138
##
## Random effects:
## Groups Name Variance
## chapter_target_partner:(target_partner:perceiver) (Intercept) 0.0036517
## target_partner:perceiver (Intercept) 0.0002721
## perceiver (Intercept) 0.0004179
## Residual 0.2303103
## Std.Dev.
## 0.06043
## 0.01650
## 0.02044
## 0.47991
## Number of obs: 312572, groups:
## chapter_target_partner:(target_partner:perceiver), 22340; target_partner:perceiver, 2720; perceiver, 2720
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.3950263 0.0033924 116.44
## turns_from_inference -0.0291351 0.0002044 -142.56
## paradigmStandard Stimulus -0.0518567 0.0031991 -16.21
##
## Correlation of Fixed Effects:
## (Intr) trns__
## trns_frm_nf -0.508
## prdgmStndrS -0.857 0.138
## optimizer (nloptwrap) convergence code: 0 (OK)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
avg_data <- df %>%
group_by(turn_percent_through_chapter, speaker) %>%
summarize(sem_sim = mean(sem_sim), .groups = "drop")
ggplot(df, aes(x = turn_percent_through_chapter, y = sem_sim)) +
geom_line(aes(group = perceiver, color = speaker),
alpha = 0.005, size = 0.5) +
scale_color_manual(
values = c("Partner" = "red", "Target" = "blue"),
name = "Speaker"
) +
geom_hline(aes(yintercept = 0), color = "black") +
# Separate average lines for Target and Partner
geom_smooth(data = avg_data %>% filter(speaker == "Target"),
aes(x = turn_percent_through_chapter, y = sem_sim),
method = "loess", se = FALSE, color = "red") +
geom_smooth(data = avg_data %>% filter(speaker == "Partner"),
aes(x = turn_percent_through_chapter, y = sem_sim),
method = "loess", se = FALSE, color = "blue") +
labs(
title = "Turn Distance from Inference by Semantic Similarity",
x = "Proximity to Inference",
y = "Semantic Similarity",
color = "Speaker"
) +
papaja::theme_apa(
base_family = "Times New Roman"
) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction <- function(df, variable, var_name = "VARIABLE", subtitle = FALSE, subtitle_text = NA){
plot <- ggplot(df, aes(x = sem_sim, y = !!sym(variable))) +
geom_point(color = "black", alpha = 0.01, size = 0.5) +
geom_smooth(color = "black", method = "lm", se = TRUE) +
theme_apa(base_family = "Times New Roman") +
labs(
title = paste0("Correlation Between\n ",
var_name, " and Semantic Similarity"),
x = "Semantic Similarity",
y = var_name,
caption = paste0("Correlation = ",
round(cor(df["sem_sim"], df[variable]), 2))
)
if (subtitle) {
plot <- plot + labs(subtitle = subtitle_text)
}
return(plot)
}
LanguageFigsFunction(df, variable = "cog_processing_language", "Cognitive Processing Language",
subtitle = TRUE, subtitle_text = "e.g., know, think, cause")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "memory_language", "Memory Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "certain_language", "Certainty Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "self_ref_language", "Self-Referential Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "curious_language", "Curiousity Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "perception_language", "Perception Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_anxious", "Anxious Emotion")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_sad", "Sad Emotion")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_anger", "Anger Emotion")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_positive", "Positive Emotions")
## `geom_smooth()` using formula = 'y ~ x'
cormat <- df %>%
select(sem_sim,
cog_processing_language,
memory_language,
certain_language,
self_ref_language,
curious_language,
perception_language,
emo_anxious,
emo_sad,
emo_anger,
emo_positive) %>%
rename("Semantic Similarity" = sem_sim,
'Cognitive Processing' = cog_processing_language,
'Memory' = memory_language,
'Certainty' = certain_language,
'Self-Referential' = self_ref_language,
'Curiousity' = curious_language,
'Perception'= perception_language,
'Anxious' = emo_anxious,
'Sad' = emo_sad,
'Anger' = emo_anger,
'Positive Emotions' = emo_positive
) %>%
cor()
cormat_melt <- melt(cormat)
ggplot(cormat_melt, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() + # Heatmap tiles
geom_text(aes(label = format(round(value, 2), nsmall = 2)), color = "black", size = 3) + # Overlay correlation values
scale_fill_gradient2(
low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = "Correlation"
) +
labs(
title = "Heatmap of Correlations",
x = NULL,
y = NULL
) +
theme_apa() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) # Rotate x-axis labels